Load packages
library(data.table)
library(stringr)
library(ggplot2)
theme_py <- theme_light() + theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_rect(colour = "black", fill = NA),
text = element_text(size = 20),
strip.placement = "outside",
strip.text = element_text(size = 20, color = "black"),
strip.background = element_rect(fill = "white")
)
theme_set(theme_py)
library(patchwork)
library(ggrepel)
library(ComplexHeatmap)
library(GenomicRanges)
library(openxlsx)
require(ggraph)
require(igraph)
source("scripts/functions.R")
source("../motif-analysis/mta_downstream_functions.R")
source("../gene-set-analysis/gene-set-analysis.R")
Data directories and metadata
atac_dir <- file.path(
"ATACSEQ", "nucleosome_free_regions", "results"
)
cns_dir <- file.path(
"ATACSEQ", "nucleosome_free_regions", "consensus_peaks"
)
bnd_dir <- file.path(
"ATACSEQ", "nucleosome_free_regions", "footprint", "BINDetect"
)
rna_dir <- file.path(
"RNASEQ_QUANTIFICATION", "results"
)
pub_dir <- file.path("published")
res_dir <- file.path("results")
fig_dir <- file.path("plots")
for (newdir in c(res_dir, fig_dir))
dir.create(newdir, showWarnings = FALSE)
# metadata
des_fn <- file.path("RNASEQ_QUANTIFICATION", "design_table.tsv")
des_dt <- fread(des_fn)
setnames(des_dt, "reporter_line", "reporterline")
col_dt <- des_dt[, .(sample, reporterline, condition)]
col_dt[, reporterline := factor(
reporterline,
levels = c("Elav", "Fox", "Ncol")
)]
col_dt[, condition := factor(condition, levels = c("negative", "positive"))]
setorder(col_dt, reporterline, condition)
# colors
condition_cols <- c("positive" = "blue", "negative" = "red")
line_cols <- c("Elav" = "#ff7f00", "Fox" = "#984ea3", "Ncol" = "#4daf4a")
line_cond_cols <- c(
"Elav_pos" = "#ff7f00", "Fox_pos" = "#984ea3", "Ncol_pos" = "#4daf4a",
"Elav_neg" = "#ebbd8f", "Fox_neg" = "#d18adb", "Ncol_neg" = "#90d18e"
)
euler_cols <- c(
line_cond_cols[grep("pos", names(line_cond_cols))],
c(
"Elav_pos&Ncol_pos" = "#ffce8c",
"Elav_pos&Fox_pos" = "#f1b6ba",
"Fox_pos&Ncol_pos" = "#daffcb",
"Elav_pos&Fox_pos&Ncol_pos" = "#fffdfe"
)
)
Gene annotations and lists
marks_gold <- fread(
file.path("annotation", "golden-marks-231124.tsv"),
fill = TRUE, sep = "\t", header = FALSE
)[, c(2:1)]
setnames(marks_gold, c("gene", "name"))
marks_gold_v <- structure(marks_gold$name, names = marks_gold$gene)
tfs_annt <- fread(
file.path("annotation", "tfs.Nvec.tsv"),
header = TRUE
)[, .SD[1], gene]
setnames(tfs_annt, c("gene", "name", "pfam"))
gene_annt <- fread(
file.path("annotation", "Nvec_annotation_v3_2020-10-23_DToL_names"),
select = 1:3
)
setnames(gene_annt, c("gene", "pfam", "name"))
gene_ann <- rbindlist(list(
gene_annt[!gene %in% tfs_annt$gene],
tfs_annt
), use.names = TRUE)
gene_ann[gene %in% marks_gold$gene, name := marks_gold_v[gene]]
Load normalized gene expression (RNA-seq) and motif accessibility (ATAC-seq) data
# GENES
# size norm gene expression
exps_mt <- read.table(
file.path(rna_dir, "mat_norm.tsv")
)
# size and gene norm gene expression
expr_mt <- read.table(
file.path(rna_dir, "norm_expression.tsv")
)
# gene expression TPMs
tpms_mt <- read.table(
file.path(rna_dir, "tpm_expression.tsv")
)
# gene clusters
gene_clust <- fread(
file.path(rna_dir, "genes_clusters.tsv")
)
# PEAKS
# peak accessibility
accs_mt <- read.table(
file.path(atac_dir, "mat_norm.tsv")
)
# peak to gene assignment
assign <- fread(
file.path(atac_dir, "consensusSeekeR-peaks-gene-assignment.tsv")
)
# peak clusters
pks_clust <- fread(
file.path(atac_dir, "consensusSeekeR-peaks-clusters.bed")
)
bed_cols <- c(
"seqnames", "start", "end", "peak", "score", "strand", "cluster", "group"
)
setnames(pks_clust, bed_cols)
# MOTIFS
# motif enrichment scores
motf_dt <- fread(
file.path(atac_dir, "motif-enrich-archetypes-mona.tsv")
)
setnames(motf_dt, "label", "group")
# motif enrichment qvalue
motf_qv <- read.table(
file.path(atac_dir, "motif-enrich-archetypes-mona-qvalue.tsv"),
sep = "\t"
)
# motif enrichment log2fc
motf_fc <- read.table(
file.path(atac_dir, "motif-enrich-archetypes-mona-fc.tsv"),
sep = "\t"
)
# motif to tf assignment
dc <- fread(file.path(
"annotation",
"assignment-archetype-motif-gene.tsv.gz"
))[, .(gene, archetype_name)]
dc[, motif := str_extract(archetype_name, "ARCH\\d+")]
dc[, archetype_name := NULL]
Load fold changes for expression and motif binding
# binding comparisons fold changes for best correlated motifs
dp_dt <- fread(file.path(
bnd_dir, "bindetect_results_reporterline.txt"
))
dp_dt[, id := paste(reporterline, vs, sep = "_vs_")]
dp_dt[, motif_gene := paste(motif, gene, sep = "__")]
dt_atac <- dcast.data.table(
dp_dt, motif_gene ~ id, value.var = "log2FoldChange"
)
mt_atac <- as.matrix(dt_atac[, -1])
rownames(mt_atac) <- dt_atac$motif_gene
# RNA comparisons fold changes
mt_rna <- read.table(
file.path(rna_dir, "log2fc_expression.tsv"),
sep = "\t",
header = TRUE
)
Dot plot of expression and binding score
dt_rna_plot <- melt.data.table(
as.data.table(mt_rna, keep.rownames = "gene"),
id.vars = "gene", variable.name = "comparison", value.name = "rna_fc"
)
dt_atac_plot <- melt.data.table(
as.data.table(mt_atac, keep.rownames = "motif_gene"),
id.vars = "motif_gene", variable.name = "comparison", value.name = "atac_fc"
)
dt_atac_plot[, gene := str_remove(motif_gene, "ARCH\\d+__")]
dt_atac_plot[, motif := str_extract(motif_gene, "ARCH\\d+")]
dt_comb <- merge.data.table(
dt_rna_plot, dt_atac_plot,
by = c("gene", "comparison")
)
# filtering
ids_rna <- rownames(mt_rna)[apply(mt_rna, 1, function(x) max(x) > 1.5)]
ids_atac <- rownames(mt_atac)[apply(mt_atac, 1, function(x) max(x) > 0.1)]
dt_plot <- dt_comb[gene %in% ids_rna & motif_gene %in% ids_atac]
# add gene annotation
dt_plot <- merge.data.table(
dt_plot, gene_ann, by = "gene",
all.x = TRUE, sort = FALSE
)
# label for plotting
dt_plot[, label := paste(
substr(name, 1, 30), gene, motif,
sep = " | "
)]
# clustering
tfs <- unique(dt_plot$gene)
hc <- hclust(dist(mt_rna[tfs, ]), method = "ward.D2")
ord <- tfs[hc$order]
# ordering
tfs <- unique(dt_plot$gene)
mord <- order(apply(mt_rna[tfs,], 1, which.max))
ord <- tfs[mord]
# order genes
dt_plot[, gene := factor(gene, levels = rev(ord))]
setorder(dt_plot, gene)
dt_plot[, label := factor(label, levels = unique(dt_plot$label))]
setorder(dt_plot, label)
# limits for plotting
dt_plot[rna_fc < 0, rna_fc := 0]
dt_plot[rna_fc > 8, rna_fc := 8]
# plot
require(ggplot2)
require(scales)
## Loading required package: scales
gp_dot <- ggplot(dt_plot, aes(comparison, label)) +
geom_point(
aes(size = rna_fc, fill = atac_fc),
shape = 21,
color = "black"
) +
scale_fill_gradientn(
colours = colorRampPalette(RColorBrewer::brewer.pal(11, "BrBG"))(1000),
# limits = c(-5, 5), oob = squish, breaks = c(-5, 0, 5),
name = "motif binding\nlog2(fold change)",
) +
scale_size_continuous(
# range = c(1, 7),
# breaks = c(0, 2, 4, 6),
name = "gene expression\nlog2(fold change)"
) +
theme(
panel.grid.major = element_line(size = 0.25),
axis.text.y = element_text(size = 8),
axis.text.x = element_text(size = 8, angle = 90, vjust = 0.5, hjust = 1),
legend.direction = "vertical"
)
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
gp_dot
Select correlated genes
mt_bnd <- copy(mt_atac)
rownames(mt_bnd) <- str_remove(rownames(mt_bnd), "ARCH\\d+__")
mt_exp <- mt_rna[rownames(mt_bnd), colnames(mt_bnd)]
rownames(mt_bnd) <- rownames(mt_atac)
rownames(mt_exp) <- rownames(mt_atac)
# correlation between same comparisons (columns)
mt_cors_smp <- cor(mt_bnd, mt_exp, method = "spearman")
diag(mt_cors_smp)
## Elav_vs_Fox Elav_vs_Ncol Elav_vs_neg Fox_vs_Elav Fox_vs_Ncol Fox_vs_neg
## 0.17200287 0.07699029 0.29388865 0.17200287 0.18450950 0.18314571
## Ncol_vs_Elav Ncol_vs_Fox Ncol_vs_neg
## 0.07699029 0.18450950 0.12235286
# correlation between genes (rows)
mt_cors_gen <- cor(t(mt_bnd), t(mt_exp), method = "spearman")
cors_gen <- diag(mt_cors_gen)
names(cors_gen) <- rownames(mt_cors_gen)
cors_gen <- sort(cors_gen, decreasing = TRUE)
# select correlated genes
select_pos <- names(cors_gen[(cors_gen) > 0.5])
# correlation distribution
dt_cors <- data.table(cor = cors_gen)
dt_cors$motif_gene <- names(cors_gen)
dt_cors[, correlation := "no"]
dt_cors[motif_gene %in% select_pos, correlation := "positive"]
dt_cors[, motif := str_extract(motif_gene, "ARCH\\d+")]
dt_cors[, gene := str_remove(motif_gene, "ARCH\\d+__")]
# plot
gp_hist <- ggplot(dt_cors, aes(cor, fill = correlation)) +
geom_histogram(color = "black", binwidth = 0.05) +
scale_y_continuous(expand = expansion(c(0, 0.1))) +
scale_fill_manual(values = c(
"positive" = "green",
"no" = "grey"
)) +
labs(x = "correlation", y = "number of genes") +
theme(legend.position = "none")
gp_hist
Inspect correlations between gene expression and motif binding score per sample
mt_ord <- cbind(mt_exp, mt_bnd)
colnames(mt_ord) <- paste(
colnames(mt_ord),
rep(c("RNA", "ATAC"), each = 9),
sep = "_"
)
mt_dt <- as.data.table(mt_ord, keep.rownames = "motif_gene")
mt_dt[, motif := str_extract(motif_gene, "ARCH\\d+")]
mt_dt[, gene := str_remove(motif_gene, "ARCH\\d+__")]
mt_dt[, motif_gene := NULL]
dt_atac <- melt.data.table(
mt_dt,
id.vars = c("gene", "motif"),
measure.vars = grep("ATAC", colnames(mt_dt), value = TRUE),
value.name = "ATAC", variable.name = "comparison"
)
dt_atac[, comparison := str_remove(comparison, "_ATAC")]
dt_rna <- melt.data.table(
mt_dt,
id.vars = "gene",
measure.vars = grep("RNA", colnames(mt_dt), value = TRUE),
value.name = "RNA", variable.name = "comparison"
)
dt_rna[, comparison := str_remove(comparison, "_RNA")]
dt_comp <- merge.data.table(
dt_rna, dt_atac,
by = c("gene", "comparison"), all = TRUE
)
setcolorder(dt_comp, c("motif", "gene", "ATAC", "RNA"))
dt_comp[, reporterline := str_extract(comparison, "Elav|Fox|Ncol")]
dt_comp[, vs := str_extract(comparison, "vs.*")]
dt_comp[, vs := str_remove(vs, "vs_")]
dt_comp <- merge.data.table(
dt_comp, dt_cors, by = c("motif", "gene"),
all.x = TRUE, sort = FALSE
)
# add gene annotation
dt_comp <- merge.data.table(
dt_comp, gene_ann, by = "gene",
all.x = TRUE, sort = FALSE
)
# save
fwrite(
dt_comp,
file.path(res_dir, "correlation_expression_binding.tsv"),
sep = "\t"
)
# label for plotting
dt_comp[, label := substr(name, 1, 20)]
# posterboys to label
dt_lab <- dt_comp[cor > 0.5 & abs(RNA) > 1 & abs(ATAC) > 0.1]
# plot
gp_comp <- ggplot(dt_comp, aes(RNA, ATAC, color = correlation)) +
geom_point(data = dt_comp[correlation == "no"], alpha = 0.3) +
geom_smooth(method = "lm", se = FALSE) +
geom_text_repel(
data = dt_lab[ATAC > 0],
aes(label = label),
color = "black",
size = 3,
segment.color = "grey50",
min.segment.length = 0,
nudge_y = 0.5 - dt_lab[ATAC > 0]$ATAC,
) +
geom_text_repel(
data = dt_lab[ATAC < 0],
aes(label = label),
color = "black",
size = 3,
segment.color = "grey50",
min.segment.length = 0,
nudge_y = -0.5 - dt_lab[ATAC < 0]$ATAC,
) +
geom_point(data = dt_comp[correlation != "no"], alpha = 0.8) +
scale_color_manual(values = c(
"positive" = "green",
"no" = "grey"
)) +
scale_y_continuous(expand = c(0.1, 0.1)) +
facet_grid(vs ~ reporterline) +
theme(legend.position = "bottom")
gp_comp
## `geom_smooth()` using formula = 'y ~ x'
## Warning: ggrepel: 16 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 15 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 17 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 19 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 25 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 15 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 16 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 30 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 17 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 30 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 19 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 14 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
Inspect correlation of binding and expression per gene
# binding correlations with expression
dt_comp <- fread(
file.path(res_dir, "correlation_expression_binding.tsv")
)
dt_comp_cor <- unique(
dt_comp[correlation != "no"][, .(motif, gene, reporterline)]
)
# for individual genes
comp_marks <- dt_comp[gene %in% marks_gold$gene]
comp_marks_gp <- ggplot(comp_marks, aes(ATAC, RNA, color = reporterline, shape = vs)) +
scale_color_manual(values = line_cols) +
scale_shape_manual(values = c("Elav" = 15, "Fox" = 16, "Ncol" = 17, "neg" = 18)) +
ggpubr::stat_cor(aes(group = gene), method = "pearson", label.x = -0.5, label.y = 10, color = "black") +
facet_wrap("name") +
geom_hline(yintercept = 0, linetype = 2, size = 0.5) +
geom_vline(xintercept = 0, linetype = 2, size = 0.5) +
geom_point(size = 2) +
theme(legend.position = "bottom") +
labs(x = "motif binding logFC", y = "expression logFC")
comp_marks_gp
Load bound targets info, select those tfs and target genes that are also expressed in given sample, and add expression correlation info to construct baseline networks.
# binding correlations with expression
dt_comp <- fread(
file.path(res_dir, "correlation_expression_binding.tsv")
)
# binding targets
mo_dt <- fread(
file.path(atac_dir, "motifs_genes_targets_long.tsv.gz")
)
mo_dt <- mo_dt[expressed == TRUE & target_expressed == TRUE & bound == TRUE]
mo_dt <- mo_dt[grep("pos", sample)]
mo_dt[, reporterline := str_remove(sample, "_pos")]
# add TF-target gene expression correlation
g_cors_dt <- unique(mo_dt[, .(gene, target_gene)])
g_cors_dt <- g_cors_dt[gene %in% rownames(tpms_mt)]
g_cors_dt <- g_cors_dt[target_gene %in% rownames(tpms_mt)]
g_cors_dt[, expression_correlation := cor(
unlist(tpms_mt[gene, ]),
unlist(tpms_mt[target_gene, ])
), by = 1:nrow(g_cors_dt)]
mo_dt <- merge.data.table(
mo_dt, g_cors_dt, by = c("gene", "target_gene"),
all.x = TRUE, sort = FALSE
)
mo_dt <- mo_dt[!is.na(expression_correlation)]
mo_dt[, c("expressed", "target_expressed", "bound") := NULL]
# save
fwrite(
mo_dt,
file.path(res_dir, "network.tsv.gz"),
sep = "\t"
)
mo_dt <- fread(file.path(res_dir, "network.tsv.gz"))
# distribution of expression lfc
exp_tf_dt <- unique(mo_dt[, .(sample, gene, expression_lfc, expression_tpm)])
exp_tg_dt <- unique(mo_dt[, .(sample, target_gene, target_expression_lfc, target_expression_tpm)])
setnames(exp_tg_dt, colnames(exp_tg_dt), str_remove(colnames(exp_tg_dt), "target_"))
exp_dist_plot <- function(exp_dt, title = "Expression distribution", lfc_thr = 0) {
gp_scat <- ggplot(
exp_dt,
aes(expression_lfc, expression_tpm, color = sample)
) +
geom_point(alpha = 0.8) +
scale_color_manual(values = line_cond_cols) +
scale_y_log10() +
geom_vline(xintercept = lfc_thr, color = "red", linetype = 2) +
theme(
legend.position = "none",
panel.grid.major = element_line(linewidth = 0.1)
) +
labs(x = "expression logFC", y = "expression TPM")
gp_dist_lfc <- ggplot(
exp_dt,
aes(sample, expression_lfc, fill = sample)
) +
geom_boxplot(aes(color = sample)) +
geom_boxplot(aes(fill = sample), outlier.colour = NA) +
geom_hline(yintercept = lfc_thr, color = "red", linetype = 2) +
scale_fill_manual(values = line_cond_cols) +
scale_color_manual(values = line_cond_cols) +
coord_flip() +
theme(
legend.position = "none",
panel.grid.major = element_line(linewidth = 0.1),
axis.title = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank()
)
gp_dist_tpm <- ggplot(
exp_dt,
aes(sample, expression_tpm)
) +
geom_boxplot(aes(color = sample)) +
geom_boxplot(aes(fill = sample), outlier.colour = NA) +
scale_fill_manual(values = line_cond_cols) +
scale_color_manual(values = line_cond_cols) +
scale_y_log10() +
theme(
legend.position = "none",
panel.grid.major = element_line(linewidth = 0.1),
axis.title = element_blank(),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
axis.text.y = element_blank(),
axis.ticks.y = element_blank()
)
gp_exp <- (gp_dist_lfc + plot_spacer() + gp_scat + gp_dist_tpm) +
plot_layout(
nrow = 2, ncol = 2,
heights = c(1, 3), widths = c(3, 1)
) +
plot_annotation(title = title)
gp_exp
}
gp_tf_exp <- exp_dist_plot(exp_tf_dt, title = "TFs expression")
gp_tf_exp
gp_tg_exp <- exp_dist_plot(exp_tg_dt, title = "Target genes expression")
gp_tg_exp
Filtering by expression
mo_dt <- mo_dt[expression_lfc > 0][target_expression_lfc > 0]
# mo_dt <- mo_dt[expression_tpm > 100][target_expression_tpm > 100]
setcolorder(mo_dt, c(
"sample", "gene", "name", "pfam",
"expression_tpm", "expression_lfc",
"target_gene", "target_name", "target_pfam",
"target_expression_tpm", "target_expression_lfc",
"motif", "motif_score", "footprint_score",
"expression_correlation"
))
# save
fwrite(
mo_dt,
file.path(res_dir, "network_expression.tsv.gz"),
sep = "\t"
)
Footprint score distribution
mo_dt <- fread(
file.path(atac_dir, "motifs_genes_targets_long.tsv.gz")
)
bnd_dt <- unique(
mo_dt[expressed == TRUE & target_expressed == TRUE][, .(
sample, gene, target_gene, motif, expression_lfc, footprint_score, bound
)]
)
bnd_dt[, bound := ifelse(bound, "bound", "not bound")]
bnd_dt[, bound := factor(bound, levels = c("not bound", "bound"))]
gp_bnd_dist <- ggplot(bnd_dt, aes(footprint_score, color = sample, linetype = bound)) +
geom_density(alpha = 0.1) +
geom_vline(xintercept = 4, color = "red", linetype = 2) +
scale_color_manual(values = line_cond_cols) +
scale_linetype_manual(values = c("bound" = 1, "not bound" = 2)) +
scale_fill_manual(values = c("bound" = "grey60", "not bound" = "grey100")) +
scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +
scale_x_continuous(breaks = c(0, 4, seq(10, 100, 10)), expand = expansion(mult = c(0, 0)), limits = c(NA, 40), oob = scales::squish) +
theme(
legend.position = "bottom", legend.box = "vertical",
axis.text.y = element_blank(),
axis.ticks.y = element_blank()
)
gp_bnd_boxp <- ggplot(bnd_dt, aes(sample, footprint_score, fill = sample)) +
geom_boxplot(aes(color = sample)) +
geom_boxplot(aes(fill = sample), outlier.colour = NA) +
scale_fill_manual(values = line_cond_cols) +
scale_color_manual(values = line_cond_cols) +
scale_y_continuous(trans = "log10") +
facet_wrap("bound", scales = "free") +
theme(
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
legend.position = "none"
)
gp_bnd_dist
# select correlated motifs
dt_comp <- fread(
file.path(res_dir, "correlation_expression_binding.tsv")
)
dt_comp[, sample := paste(reporterline, "_pos")]
dt_comp_cor <- unique(
dt_comp[correlation != "no"][, .(motif, gene, sample)]
)
mo_dt_cor <- merge.data.table(
dt_comp_cor, mo_dt, by = c("motif", "gene", "sample"), sort = FALSE
)
# save
fwrite(
mo_dt_cor,
file.path(res_dir, "network_correlation.tsv.gz"),
sep = "\t"
)
Save network files
mo_dt <- fread(file.path(res_dir, "network_expression.tsv.gz"))
mo_dt[, reporterline := NULL]
mo_dt[, motif_name := NULL]
mo_dt[, target_TF := ifelse(target_gene %in% .SD$gene, TRUE, FALSE), sample]
mo_dt[, name := substr(name, 1, 30)]
mo_dt[, pfam := substr(pfam, 1, 30)]
mo_dt[, target_name := substr(target_name, 1, 30)]
mo_dt[, target_pfam := substr(target_pfam, 1, 30)]
require(openxlsx)
wb <- createWorkbook()
for (smp in c("Elav", "Ncol", "Fox")) {
smp_fn <- file.path(res_dir, sprintf("network_%s.tsv.gz", smp))
smp_dt <- mo_dt[sample == paste0(smp, "_pos")]
message("Saving: ", smp_fn)
fwrite(smp_dt, smp_fn, sep = "\t")
# all genes
addWorksheet(wb, sheetName = smp)
writeData(wb, sheet = smp, smp_dt)
# TF-TF
tfs_dt <- smp_dt[target_TF == TRUE]
tfs_orph <- smp_dt[!gene %in% tfs_dt$gene, .SD[1], gene]
tfs_orph[, target_gene := "none"]
tfs_orph[, c("target_name", "target_pfam") := ""]
tfs_orph[, c("target_expression_tpm", "target_expression_lfc", "expression_correlation") := 0]
tfs_dt <- rbindlist(list(tfs_dt, tfs_orph), use.names = TRUE)
addWorksheet(wb, sheetName = paste0(smp, "_TF"))
writeData(wb, sheet = paste0(smp, "_TF"), tfs_dt)
}
## Saving: results/network_Elav.tsv.gz
## Saving: results/network_Ncol.tsv.gz
## Saving: results/network_Fox.tsv.gz
saveWorkbook(
wb,
file.path(res_dir, "network.xlsx"),
overwrite = TRUE
)
How many target genes for each TF?
hits_dt <- mo_dt[, .N, .(gene, motif, sample)]
hits_gp <- ggplot(hits_dt, aes(sample, N, fill = sample)) +
ggbeeswarm::geom_beeswarm(shape = 21) +
geom_boxplot(outlier.color = NA, alpha = 0.2) +
scale_y_continuous(trans = "log10") +
scale_fill_manual(values = line_cond_cols) +
labs(y = "target genes per TF") +
theme(
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
legend.position = "none",
panel.grid.major.y = element_line(linewidth = 0.5),
panel.grid.minor.y = element_line(linewidth = 0.2)
) +
geom_text(
data = hits_dt[,.N,sample],
aes(label = N, y = 15)
)
hits_gp
TFs overlapping between the lines
require(eulerr)
## Loading required package: eulerr
mo_dt <- fread(file.path(res_dir, "network_expression.tsv.gz"))
mo_dt[, gene_n_samples := length(unique(.SD$sample)), .(gene, name, pfam)]
gene_ovl_dt <- unique(mo_dt[, .(gene, sample, gene_n_samples)])
gene_ovl_dt <- dcast.data.table(gene_ovl_dt, gene ~ sample, value.var = "gene_n_samples")
gene_ovl_dt <- unique(gene_ovl_dt)
gene_ovl_mt <- as.matrix(gene_ovl_dt[, -1])
gene_ovl_mt[!is.na(gene_ovl_mt)] <- 1
gene_ovl_mt[is.na(gene_ovl_mt)] <- 0
fit <- euler(gene_ovl_mt)
nms <- names(fit$fitted.values)
p1 <- plot(
fit,
quantities = TRUE,
fills = euler_cols[nms],
main = "TFs overlap"
)
print(p1)
For all TFs, compare sets of targets between cell lines
## png
## 2
For all target genes, compare sets of TFs that regulate them between cell lines (i.e. genes upregulated in two or three cell lines, but differently regulated)
mo_dt <- fread(file.path(res_dir, "network_expression.tsv.gz"))
# count number of cell lines where each tf - target gene is present
mo_dt[, n_samples_edge := length(unique(.SD$sample)), .(motif, target_gene)]
mo_dt[, n_samples_gene := length(unique(.SD$sample)), .(motif)]
mo_dt[, n_samples_target := length(unique(.SD$sample)), .(target_gene)]
# per target gene, the max number of samples in which any of its regulatory link is present
mo_dt[, n_samples_edge_max := max(.SD$n_samples_edge), .(target_gene)]
# of the genes that are expressed in more than one cell line, which ones are completely differently regulated?
mo_dif <- unique(
mo_dt[, .(target_gene, target_name, target_pfam, n_samples_target, n_samples_edge_max)]
)[n_samples_target > 1 & n_samples_edge_max==1]
setorder(mo_dif, -n_samples_target)
# plot euler diagram
pdf(
file.path(fig_dir, "target_genes_motifs_novl.pdf"),
width = 7, height = 4
)
for (tg in unique(mo_dif$target_gene)) {
message(tg)
nm <- substr(gene_ann[gene == tg]$name, 1, 25)
pf <- substr(gene_ann[gene == tg]$pfam, 1, 25)
tg_dt <- unique(mo_dt[target_gene == tg][, .(motif, sample, motif_score)])
tg_dt <- dcast.data.table(tg_dt, motif ~ sample, value.var = "motif_score")
tg_dt <- unique(tg_dt)
tg_mt <- as.matrix(tg_dt[, -1])
tg_mt[!is.na(tg_mt)] <- 1
tg_mt[is.na(tg_mt)] <- 0
fit <- euler(tg_mt)
nms <- names(fit$fitted.values)
p1 <- plot(
fit,
quantities = TRUE,
fills = euler_cols[nms],
main = sprintf("\n\n%s\n%s; %s", tg, nm, pf)
)
print(p1)
}
dev.off()
# plot heatmap
pdf(
file.path(fig_dir, "target_genes_motifs_novl_heatmap.pdf"),
width = 12, height = 12
)
for (tg in unique(mo_dif$target_gene)) {
message(tg)
nm <- substr(gene_ann[gene == tg]$name, 1, 35)
pf <- substr(gene_ann[gene == tg]$pfam, 1, 35)
tg_dt <- mo_dt[target_gene==tg]
setorder(tg_dt, sample)
tg_dt[, label := paste(motif, str_remove(gene, "Nvec_(vc1.1_)*"), substr(name, 1, 35), sep = " | ")]
tg_dt[, label := factor(label, levels = unique(label))]
tg_gp <- ggplot(tg_dt, aes(sample, label, fill = footprint_score, size = expression_lfc)) +
#geom_tile(color = "white") +
geom_point(shape = 21) +
scale_size_continuous(range = c(1, 16)) +
scale_y_discrete(position = "right") +
scale_fill_distiller(palette = "RdPu", direction = 1) +
theme(
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
panel.grid.major = element_line(color = "grey", linewidth = 0.2)
) +
labs(
title = sprintf("\n\n%s\n%s; %s", tg, nm, pf),
y = "", x = ""
)
print(tg_gp)
}
dev.off()
# of the genes that are expressed in more than one cell line, which ones are partially differently regulated?
mo_par <- unique(
mo_dt[, .(target_gene, target_name, target_pfam, n_samples_target, n_samples_edge_max)]
)[n_samples_target > 1 & n_samples_edge_max == 3]
setorder(mo_par, -n_samples_target)
# plot euler diagram
pdf(
file.path(fig_dir, "target_genes_motifs_povl.pdf"),
width = 7, height = 4
)
for (tg in unique(mo_par$target_gene)) {
message(tg)
nm <- substr(gene_ann[gene == tg]$name, 1, 25)
pf <- substr(gene_ann[gene == tg]$pfam, 1, 25)
tg_dt <- unique(mo_dt[target_gene == tg][, .(motif, sample, motif_score)])
tg_dt <- dcast.data.table(tg_dt, motif ~ sample, value.var = "motif_score")
tg_dt <- unique(tg_dt)
tg_mt <- as.matrix(tg_dt[, -1])
tg_mt[!is.na(tg_mt)] <- 1
tg_mt[is.na(tg_mt)] <- 0
fit <- euler(tg_mt)
nms <- names(fit$fitted.values)
p1 <- plot(
fit,
quantities = TRUE,
fills = euler_cols[nms],
main = sprintf("\n\n%s\n%s; %s", tg, nm, pf)
)
print(p1)
}
dev.off()
# plot heatmap
pdf(
file.path(fig_dir, "target_genes_motifs_povl_heatmap.pdf"),
width = 12, height = 16
)
for (tg in unique(mo_par$target_gene)) {
message(tg)
nm <- substr(gene_ann[gene == tg]$name, 1, 35)
pf <- substr(gene_ann[gene == tg]$pfam, 1, 35)
tg_dt <- mo_dt[target_gene==tg]
setorder(tg_dt, sample)
tg_dt[, label := paste(motif, str_remove(gene, "Nvec_(vc1.1_)*"), substr(name, 1, 35), sep = " | ")]
tg_dt[, label := factor(label, levels = unique(label))]
tg_gp <- ggplot(tg_dt, aes(sample, label, fill = footprint_score, size = expression_lfc)) +
#geom_tile(color = "white") +
geom_point(shape = 21) +
scale_size_continuous(range = c(1, 16)) +
scale_y_discrete(position = "right") +
scale_fill_distiller(palette = "RdPu", direction = 1) +
theme(
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
panel.grid.major = element_line(color = "grey", linewidth = 0.2)
) +
labs(
title = sprintf("\n\n%s\n%s; %s", tg, nm, pf),
y = "", x = ""
)
print(tg_gp)
}
dev.off()
# all networks
net <- fread(file.path(res_dir, "network_expression.tsv.gz"))
net[, reporterline := str_remove(sample, "_pos")]
# GO annotations
go_annotations <- readRDS(file.path("annotation", "Nvec_ensembl.GO.rds"))
# where to save results
go_dir <- file.path(res_dir, "GO")
dir.create(go_dir)
# enrichment test per reporter line
tfs_dt <- rbindlist(lapply(unique(net$reporterline), function(st) {
message("\n", "Starting GO analysis for ", st, "\n")
net_st <- net[reporterline == st]
# enrichment test per TF
tfs <- unique(net_st[, .N, gene][order(-N)][N > 10]$gene)
tfs_dt <- rbindlist(lapply(tfs, function(tf) {
genes_fg <- unique(net_st[gene == tf]$target_gene)
genes_bg <- unique(net$target_gene)
message(sprintf(
"%s (%s/%s); %s target genes; %s background genes",
tf, match(tf, tfs), length(tfs), length(genes_fg), length(genes_bg)
))
go_enrichment_table_tf <- gsa_topgo_enrichment(
annotation = go_annotations,
genes_fg = genes_fg,
genes_bg = genes_bg,
output_prefix = file.path(go_dir, "GO"),
name_fg = sprintf("%s_%s", st, tf),
ontologyset = c("BP", "MF", "CC"),
tg_test = "fisher",
tg_algorithm = "elim",
top_markers = 30,
nodesize = 10,
printfile = FALSE
)
go_tf <- as.data.table(go_enrichment_table_tf)
go_tf[, gene := tf]
}), fill = TRUE)
tfs_dt[, reporterline := st]
}))
# save
saveRDS(tfs_dt, file.path(res_dir, "go_res_per_tf.rds"))
Plot results
# load PFAM enrichment results
tfs_dt <- readRDS(file.path(res_dir, "go_res_per_tf.rds"))
setnames(tfs_dt, "GO.ID", "go_id")
tfs_dt[, annot := paste(Term, go_id, sep = " | ")]
# add TF gene annotations
tfs_dt <- merge.data.table(
tfs_dt, gene_ann, by = "gene", all.x = TRUE, sort = FALSE
)
for (x in c("Elav", "Ncol", "Fox")) {
rep_dt <- tfs_dt[reporterline == x]
# select significant categories
anns <- unique(rep_dt[pval_test < 0.01]$annot)
if (length(anns) > 50)
anns <- unique(rep_dt[, .(pval_test, annot)])[order(pval_test)]$annot[1:50]
anns_dt <- rep_dt[annot %in% anns]
# cluster categories and genes
anns_dc <- dcast.data.table(anns_dt, gene ~ annot, value.var = "pval_adj")
anns_mt <- as.matrix(anns_dc[, -1])
rownames(anns_mt) <- anns_dc[[1]]
anns_mt[is.na(anns_mt)] <- 1
anns_hc <- hclust(dist(anns_mt), method = "ward.D2")
anns_mt <- anns_mt[anns_hc$order, ]
anns_on <- unique(anns_dt[,.(annot, ontology)])[order(ontology)]
anns_ord <- unlist(tapply(anns_on$annot, anns_on$ontology, function(x) {
anns_hc <- hclust(dist(t(anns_mt[, x])), method = "ward.D2")
x[anns_hc$order]
}), use.names = FALSE)
anns_ord <- as.character(anns_ord)
anns_mt <- anns_mt[, anns_ord]
# order
anns_dt[, gene := factor(gene, levels = rownames(anns_mt))]
anns_dt[, annot := factor(annot, levels = colnames(anns_mt))]
setorder(anns_dt, gene, annot)
anns_dt[, label := paste(substr(name, 1, 30), str_remove(gene, "Nvec_(vc1.1_)*"), sep = " | ")]
anns_dt[, label := factor(label, levels = unique(anns_dt$label))]
# plot
gp_go <- ggplot(anns_dt, aes(label, annot, color = pval_test, size = Significant)) +
#geom_tile(color = "white") +
geom_point() +
scale_size_continuous(range = c(1, 8)) +
scale_color_distiller(palette = "BuPu") +
facet_grid(ontology ~ ., scales = "free", space = "free") +
theme(
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
panel.grid.major = element_line(linewidth = 0.2, color = "grey90")
) +
labs(title = x)
pw <- pmax(16, length(unique(anns_dt$label)) / 3)
ph <- pmax(12, length(unique(anns_dt$annot)) / 2.5)
pdf(
file.path(fig_dir, sprintf("go_annot_%s.pdf", x)),
width = pw, height = ph
)
print(gp_go)
dev.off()
}
# all networks
net <- fread(file.path(res_dir, "network_expression.tsv.gz"))
net[, reporterline := str_remove(sample, "_pos")]
# PFAM annotations
pfam_annotations <- gsa_enrichment_load_pfam_list(
file.path("annotation", "Nvec_long.pep.pfamscan_archs.csv")
)
# where to save results
pfam_dir <- file.path(res_dir, "PFAM")
dir.create(pfam_dir)
# enrichment test per reporter line
tfs_dt <- rbindlist(lapply(unique(net$reporterline), function(st) {
message("\n", "Starting PFAM analysis for ", st, "\n")
net_st <- net[reporterline == st]
# enrichment test per TF
tfs <- unique(net_st[, .N, gene][order(-N)][N > 10]$gene)
tfs_dt <- rbindlist(lapply(tfs, function(tf) {
genes_fg <- unique(net_st[gene == tf]$target_gene)
genes_bg <- unique(net$target_gene)
message(sprintf(
"%s (%s/%s); %s target genes; %s background genes",
tf, match(tf, tfs), length(tfs), length(genes_fg), length(genes_bg)
))
enrichment_table <- gsa_enrichment_hypergeometric(
annotation = pfam_annotations,
genes_fg = genes_fg,
genes_bg = genes_bg,
output_prefix = file.path(pfam_dir, "PFAM"),
name_fg = sprintf("%s_%s", st, tf)
)
pfam_tf <- as.data.table(enrichment_table)
pfam_tf[, gene := tf]
}), fill = TRUE)
tfs_dt[, reporterline := st]
}))
# save
saveRDS(tfs_dt, file.path(res_dir, "pfam_res_per_tf.rds"))
Plot results
# load PFAM enrichment results
tfs_dt <- readRDS(file.path(res_dir, "pfam_res_per_tf.rds"))
# add TF gene annotations
tfs_dt <- merge.data.table(
tfs_dt, gene_ann, by = "gene", all.x = TRUE, sort = FALSE
)
for (x in c("Elav", "Ncol", "Fox")) {
rep_dt <- tfs_dt[reporterline == x]
# select significant categories
anns <- rep_dt[pval_adj < 0.05]$annot
anns_dt <- rep_dt[annot %in% anns]
# cluster categories and genes
anns_dc <- dcast.data.table(anns_dt, gene ~ annot, value.var = "pval_adj")
anns_mt <- as.matrix(anns_dc[, -1])
rownames(anns_mt) <- anns_dc[[1]]
anns_mt[is.na(anns_mt)] <- 1
anns_hc <- hclust(dist(anns_mt), method = "ward.D2")
anns_mt <- anns_mt[anns_hc$order, ]
anns_hc <- hclust(dist(t(anns_mt)), method = "ward.D2")
anns_mt <- anns_mt[, anns_hc$order]
# order
anns_dt[, gene := factor(gene, levels = rownames(anns_mt))]
anns_dt[, annot := factor(annot, levels = colnames(anns_mt))]
setorder(anns_dt, gene, annot)
anns_dt[, label := paste(substr(name, 1, 30), str_remove(gene, "Nvec_(vc1.1_)*"), sep = " | ")]
anns_dt[, label := factor(label, levels = unique(anns_dt$label))]
# plot
gp_pfam <- ggplot(anns_dt, aes(label, annot, color = pval_adj, size = freq_in_fg)) +
#geom_tile(color = "white") +
geom_point() +
scale_size_continuous(range = c(1, 8)) +
scale_color_distiller(palette = "BuPu") +
theme(
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
panel.grid.major = element_line(linewidth = 0.2, color = "grey90")
) +
labs(title = x)
pw <- pmax(12, length(unique(anns_dt$label)) / 4)
ph <- pmax(12, length(unique(anns_dt$annot)) / 3)
pdf(
file.path(fig_dir, sprintf("pfam_annot_%s.pdf", x)),
width = pw, height = ph
)
print(gp_pfam)
dev.off()
}
Parse data from paper
# load data from paper and translate gene IDs to DToL IDs
pou_act <- readWorkbook(file.path(pub_dir, "elife-74336-supp6-v3.xlsx"), startRow = 2, colNames = TRUE)
pou_rep <- readWorkbook(file.path(pub_dir, "elife-74336-supp7-v3.xlsx"), startRow = 2, colNames = TRUE)
setDT(pou_act)
setDT(pou_rep)
pou_pub <- rbindlist(list(activated = pou_act, repressed = pou_rep), idcol = "regulation")
setnames(pou_pub, "gene", "ID_Vienna")
dict <- fread(file.path("annotation", "DICTIONARY_Nvec_vienna_vs_Nvec_old_vs_Nvec_DTOL_v2.txt"))[, .(ID_Vienna, ID_DToL)]
setnames(dict, "ID_DToL", "gene")
pou_pub <- merge.data.table(pou_pub, dict, all.x = TRUE, sort = FALSE)
pou_pub <- pou_pub[, .(gene, regulation, fold_change, pvalue)]
setnames(pou_pub, "gene", "target_gene")
fwrite(
pou_pub,
file.path(pub_dir, "pou4_targets_elife.tsv"),
sep = "\t"
)
Compare targets
# load data
pou4 <- "Nvec_vc1.1_XM_032363992.2"
ncol_dt <- fread(file.path(res_dir, "network_Ncol.tsv.gz"))
pou_net <- ncol_dt[gene == pou4]
# load data from paper
pou_pub <- fread(file.path(pub_dir, "pou4_targets_elife.tsv"))
# overlap of Pou4 targets
pou_pub_act <- pou_pub[regulation == "activated", .(target_gene, fold_change)]
setnames(pou_pub_act, "fold_change", "activated")
pou_pub_rep <- pou_pub[regulation == "repressed", .(target_gene, fold_change)]
setnames(pou_pub_rep, "fold_change", "repressed")
pou_net_gen <- pou_net[, .(target_gene, target_expression_lfc)]
setnames(pou_net_gen, "target_expression_lfc", "network")
pou_dt <- merge.data.table(merge.data.table(
pou_pub_act, pou_pub_rep,
by = "target_gene", all = TRUE
), pou_net_gen, by = "target_gene", all = TRUE)
pou_mt <- pou_dt[, lapply(.SD, function(x) !is.na(x)), .SD = c("activated", "repressed", "network")]
pou_mt[, target_gene := pou_dt$target_gene]
setcolorder(pou_mt, "target_gene")
pou_mt <- unique(pou_mt)
pou_mt <- as.matrix(pou_mt[, -1])
require(eulerr)
fit <- euler(pou_mt)
p1 <- plot(
fit,
quantities = TRUE
)
print(p1)
Inspect the differences
# reverse activation and repression sign (mutants)
pou_dt[, published := -1 * activated][is.na(published), published := -1 * repressed]
# sets of target genes
missing_in_network <- unique(pou_dt[!is.na(published) & is.na(network)]$target_gene)
missing_in_network_act <- unique(pou_dt[!is.na(activated) & is.na(network)]$target_gene)
missing_in_network_rep <- unique(pou_dt[!is.na(repressed) & is.na(network)]$target_gene)
missing_in_published <- unique(pou_dt[is.na(published)]$target_gene)
target_intersect <- unique(pou_dt[!is.na(published) & !is.na(network)]$target_gene)
mo_dt <- fread(
file.path(atac_dir, "motifs_genes_targets_long.tsv.gz")
)
target_bound <- unique(mo_dt[sample=="Ncol_pos" & gene == pou4]$target_gene)
# expression for target genes
pou_expression_dt <- unique(ncol_dt[, .(target_gene, target_expression_lfc, motif_score, footprint_score, expression_correlation)])
pou_expression_dt[target_gene %in% missing_in_network_act, annot := "activated"]
pou_expression_dt[target_gene %in% missing_in_network_rep, annot := "repressed"]
pou_expression_dt[target_gene %in% missing_in_published, annot := "network"]
pou_expression_dt[target_gene %in% target_intersect, annot := "intersect"]
pou_expression_dt <- pou_expression_dt[!is.na(annot)]
pou_expression_dt[, bound := ifelse(target_gene %in% target_bound, "binding", "no binding")]
# plot
pou_expression_dt[, annot := factor(
annot, levels = c("network", "intersect", "activated", "repressed")
)]
gp_pou_exp <- ggplot(pou_expression_dt, aes(annot, target_expression_lfc, fill = bound)) +
geom_boxplot() +
# ggbeeswarm::geom_quasirandom() +
geom_boxplot(outlier.color = NA, alpha = 0.5) +
scale_fill_manual(values = c("grey40", "grey80")) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
labs(x = "target genes", y = "expression log FC")
gp_pou_exp
Genes in intersect
pou_int <- pou_net[target_gene %in% target_intersect]
pou_int <- unique(merge.data.table(
pou_int, pou_pub[, .(target_gene, regulation)],
by = "target_gene", all.x = TRUE, sort = FALSE
))
pou_int[, target_TF := target_gene %in% tfs_annt$gene]
# plot
gp_pou_int <- ggplot(pou_int, aes(
expression_correlation, target_expression_lfc,
#size = 1/n_samples,
shape = target_TF,
fill = regulation,
label = target_name
)) +
ggrepel::geom_text_repel(color = "black") +
geom_point(color = "grey10") +
scale_size_continuous(
range = c(1, 6),
breaks = seq(1/3, 1, length.out = 3),
labels = seq(3, 1),
name = "target in number of samples\n(Elav, Ncol, Fox)"
) +
scale_shape_manual(
values = c("TRUE" = 22, "FALSE" = 21),
name = "target TF?"
) +
scale_fill_manual(
values = c("activated" = "grey90", "repressed" = "grey50"),
name = "relation"
) +
guides(
fill = guide_legend(override.aes = list(size = 5, shape = 21)),
shape = guide_legend(override.aes = list(size = 5)),
size = guide_legend(override.aes = list(shape = 21, fill = "grey60"))
) +
labs(
x = "Pou4 expression correlation",
y = "target gene expression logFC",
title = sprintf("%s target genes in intersect", length(unique(pou_int$target_gene)))
)
gp_pou_int
## Warning: ggrepel: 31 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
Parse data from the paper
# load data from paper and translate gene IDs to DToL IDs
pou_mut <- readWorkbook(file.path(pub_dir, "1-s2.0-S2211124720303454-mmc2.xlsx"), startRow = 1, colNames = TRUE)
setDT(pou_mut)
setnames(pou_mut, "ID", "ID_Vienna")
dict <- fread(file.path("annotation", "DICTIONARY_Nvec_vienna_vs_Nvec_old_vs_Nvec_DTOL_v2.txt"))[, .(ID_Vienna, ID_DToL)]
setnames(dict, "ID_DToL", "gene")
pou_mut <- merge.data.table(pou_mut, dict, all.x = TRUE, sort = FALSE)
setnames(
pou_mut,
c("gene", "Regulation", "log2FoldChange"),
c("target_gene", "regulation", "fold_change")
)
pou_mut[regulation == "UP", regulation := "up"]
pou_mut[regulation == "Down", regulation := "down"]
pou_mut <- pou_mut[, .(target_gene, regulation, fold_change, padj)]
fwrite(
pou_mut,
file.path(pub_dir, "pou4_targets_cellrep.tsv"),
sep = "\t"
)
Compare targets
# load our Ncol network
pou4 <- "Nvec_vc1.1_XM_032363992.2"
ncol_dt <- fread(file.path(res_dir, "network_Ncol.tsv.gz"))
pou_net <- ncol_dt[gene == pou4]
# load data from paper
pou_mut <- fread(file.path(pub_dir, "pou4_targets_cellrep.tsv"))
# overlap of Pou4 targets
pou_mut_act <- pou_mut[regulation == "up", .(target_gene, fold_change)]
setnames(pou_mut_act, "fold_change", "up")
pou_mut_rep <- pou_mut[regulation == "down", .(target_gene, fold_change)]
setnames(pou_mut_rep, "fold_change", "down")
pou_net_gen <- pou_net[, .(target_gene, target_expression_lfc)]
setnames(pou_net_gen, "target_expression_lfc", "network")
pou_dt <- unique(merge.data.table(merge.data.table(
pou_mut_act, pou_mut_rep,
by = "target_gene", all = TRUE
), pou_net_gen, by = "target_gene", all = TRUE))
pou_mt <- pou_dt[, lapply(.SD, function(x) !is.na(x)), .SD = c("down", "up", "network")]
pou_mt[, target_gene := pou_dt$target_gene]
setcolorder(pou_mt, "target_gene")
pou_mt <- unique(pou_mt)
pou_mt <- as.matrix(pou_mt[, -1])
require(eulerr)
fit <- euler(pou_mt)
p1 <- plot(
fit,
quantities = TRUE
)
print(p1)
Inspect the differences
# reverse activation and repression sign (mutants)
pou_dt[, published := -1 * up][is.na(published), published := -1 * down]
# sets of target genes
missing_in_network <- unique(pou_dt[!is.na(published) & is.na(network)]$target_gene)
missing_in_network_act <- unique(pou_dt[!is.na(down) & is.na(network)]$target_gene)
missing_in_network_rep <- unique(pou_dt[!is.na(up) & is.na(network)]$target_gene)
missing_in_published <- unique(pou_dt[is.na(published)]$target_gene)
target_intersect <- unique(pou_dt[!is.na(published) & !is.na(network)]$target_gene)
mo_dt <- fread(
file.path(atac_dir, "motifs_genes_targets_long.tsv.gz")
)
target_bound <- unique(mo_dt[sample=="Ncol_pos" & gene == pou4]$target_gene)
# expression for target genes
pou_expression_dt <- unique(ncol_dt[, .(target_gene, target_expression_lfc)])
# pou_expression_dt[target_gene %in% missing_in_network, annot := "published"]
pou_expression_dt[target_gene %in% missing_in_network_act, annot := "down"]
pou_expression_dt[target_gene %in% missing_in_network_rep, annot := "up"]
pou_expression_dt[target_gene %in% missing_in_published, annot := "network"]
pou_expression_dt[target_gene %in% target_intersect, annot := "intersect"]
pou_expression_dt <- pou_expression_dt[!is.na(annot)]
pou_expression_dt[, bound := ifelse(target_gene %in% target_bound, "binding", "no binding")]
# plot
pou_expression_dt[, annot := factor(
annot, levels = c("network", "intersect", "down", "up")
)]
gp_pou_exp <- ggplot(pou_expression_dt, aes(annot, target_expression_lfc, fill = bound)) +
geom_boxplot() +
ggbeeswarm::geom_quasirandom() +
geom_boxplot(outlier.color = NA, alpha = 0.5) +
scale_fill_manual(values = c("grey40", "grey80")) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
labs(x = "target genes", y = "expression log FC")
Genes in intersect
pou_int <- pou_net[target_gene %in% target_intersect]
pou_int <- unique(merge.data.table(
pou_int, pou_mut[, .(target_gene, regulation)],
by = "target_gene", all.x = TRUE, sort = FALSE
))
pou_int[, target_TF := target_gene %in% tfs_annt$gene]
# plot
gp_pou_int <- ggplot(pou_int, aes(
expression_correlation, target_expression_lfc,
#size = 1/n_samples,
shape = target_TF,
fill = regulation,
label = target_name
)) +
ggrepel::geom_text_repel(color = "black") +
geom_point(color = "grey10") +
scale_size_continuous(
range = c(1, 6),
breaks = seq(1/3, 1, length.out = 3),
labels = seq(3, 1),
name = "target in number of samples\n(Elav, Ncol, Fox)"
) +
scale_shape_manual(
values = c("TRUE" = 22, "FALSE" = 21),
name = "target TF?"
) +
scale_fill_manual(
values = c("down" = "grey90", "up" = "grey50"),
name = "regulation"
) +
guides(
fill = guide_legend(override.aes = list(size = 5, shape = 21)),
shape = guide_legend(override.aes = list(size = 5)),
size = guide_legend(override.aes = list(shape = 21, fill = "grey60"))
) +
labs(
x = "Pou4 expression correlation",
y = "target gene expression logFC",
title = sprintf("%s target genes in intersect", length(unique(pou_int$target_gene)))
)
gp_pou_int
## Warning: ggrepel: 42 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
Overlaps
# load published data
pou_cellrep <- fread(file.path(pub_dir, "pou4_targets_cellrep.tsv"))
pou_elife <- fread(file.path(pub_dir, "pou4_targets_elife.tsv"))
# load our Ncol network
pou4 <- "Nvec_vc1.1_XM_032363992.2"
ncol_dt <- fread(file.path(res_dir, "network_Ncol.tsv.gz"))
pou_net <- ncol_dt[gene == pou4]
# overlap of Pou4 targets
pou_dt <- unique(rbindlist(list(
pou_cellrep[, .(target_gene, regulation)],
pou_elife[, .(target_gene, regulation)],
pou_net[, .(target_gene)][, regulation := "network"]
)))
pou_dc <- dcast.data.table(pou_dt, target_gene ~ regulation, value.var = "regulation")
pou_dc <- unique(pou_dc)
pou_mt <- pou_dc[, lapply(.SD, function(x) !is.na(x)), .SD = c("down", "up", "activated", "repressed", "network")]
pou_mt <- as.matrix(pou_mt)
pou_cols <- c(
"down" = "#add8e6ff",
"up" = "#f08080ff",
"activated" = "#3949dbff",
"repressed" = "#de2346ff",
"network" = "#ffecb3ff"
)
require(eulerr)
set.seed(1950)
fit <- euler(pou_mt)
p1 <- plot(
fit,
quantities = TRUE,
fills = pou_cols
)
print(p1)
Genes in intersect
pou_int <- pou_dc[!is.na(network) & (!is.na(activated) | !is.na(repressed)) & (!is.na(up) | !is.na(down))]
pou_int <- pou_net[target_gene %in% pou_int$target_gene]
pou_int <- unique(merge.data.table(
pou_int, pou_elife[, .(target_gene, regulation)],
by = "target_gene", all.x = TRUE, sort = FALSE
))
pou_int[, target_TF := target_gene %in% tfs_annt$gene]
fwrite(
pou_int,
file.path(res_dir, "pou4_target_genes_intersect.tsv"),
sep = "\t"
)
# plot
gp_pou_int <- ggplot(pou_int, aes(
expression_correlation, target_expression_lfc,
#size = 1/n_samples,
shape = target_TF,
fill = regulation,
label = target_name
)) +
ggrepel::geom_text_repel(color = "black") +
geom_point(color = "grey10") +
scale_size_continuous(
range = c(1, 6),
breaks = seq(1/3, 1, length.out = 3),
labels = seq(3, 1),
name = "target in number of samples\n(Elav, Ncol, Fox)"
) +
scale_shape_manual(
values = c("TRUE" = 22, "FALSE" = 21),
name = "target TF?"
) +
scale_fill_manual(
values = c("activated" = "grey90", "repressed" = "grey50"),
name = "regulation"
) +
guides(
fill = guide_legend(override.aes = list(size = 5, shape = 21)),
shape = guide_legend(override.aes = list(size = 5)),
size = guide_legend(override.aes = list(shape = 21, fill = "grey60"))
) +
labs(
x = "Pou4 expression correlation",
y = "target gene expression logFC",
title = sprintf("%s target genes in intersect", length(unique(pou_int$target_gene)))
)
gp_pou_int
## Warning: ggrepel: 23 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
Difference
# all genes expressed in Ncol, even if they are not bound by Pou4
mo_dt <- fread(file.path(atac_dir, "motifs_genes_targets_long.tsv.gz"))
all_dt <- mo_dt[sample == "Ncol_pos"][gene == pou4]
oth_dt <- mo_dt[sample == "Ncol_pos"][!target_gene %in% pou_dt$target_gene][, .SD[1], .(target_gene)]
oth_dt[, c("gene", "name", "pfam", "motif", "motif_name", "motif_score", "expression_tpm", "expression_lfc", "expressed", "bound", "footprint_score") := NA]
all_dt <- rbindlist(list(all_dt, oth_dt), use.names = TRUE)
# categories of genes bound by Pou4
pou_dt <- unique(rbindlist(list(
pou_cellrep[, .(target_gene, regulation)],
pou_elife[, .(target_gene, regulation)],
pou_net[, .(target_gene)][, regulation := "network"]
)))
pou_dt[, regulation := factor(regulation, levels = c("network", "up", "activated", "down", "repressed"))]
pou_dt[, regulation := paste(as.character(sort(unique(.SD$regulation))), collapse = "+"), target_gene]
# combine
all_dt <- merge.data.table(all_dt, pou_dt, by = "target_gene", all.y = TRUE)
all_dt[, sample := NULL]
sum_dt <- all_dt[,length(unique(.SD$target_gene)),regulation][order(-V1)][V1>1]
sub_dt <- all_dt[regulation %in% sum_dt$regulation]
sub_dt[, regulation := factor(regulation, levels = unique(sum_dt$regulation))]
# plot
pou_cols <- c(
"network" = "#ffecb3ff",
"network+activated+down" = "#b1b1dcff",
"network+activated" = "#b1b1dcff",
"network+down" = "#b1b1dcff",
"activated+down" = "#475ee6ff",
"activated" = "#475ee6ff",
"down" = "#add8e6ff",
"network+up" = "#fbbea6ff",
"network+up+repressed" = "#fbbea6ff",
"up" = "#f08080ff",
"up+repressed" = "#e85962ff",
"repressed" = "#de2346ff"
)
sub_dt[, regulation := factor(regulation, levels = names(pou_cols))]
sub_dt[target_expression_lfc < 0, target_expression_lfc := 0]
sub_dt[is.na(target_expression_lfc), target_expression_lfc := 0]
sub_dt[is.na(footprint_score), footprint_score := 0]
gp_pou_diff <- ggplot(sub_dt, aes(target_expression_lfc, footprint_score, color = regulation)) +
geom_point() +
scale_color_manual(values = pou_cols) +
geom_hline(yintercept = 4, linetype = 2) +
geom_vline(xintercept = 0, linetype = 2) +
facet_wrap("regulation") +
theme(legend.position = "bottom")
gp_pou_diff
How many non-network genes are bound by Pou4?
unique(sub_dt[regulation %in% c("activated", "down", "activated+down")][,.(regulation,target_gene,bound)])[,.N,bound]
## bound N
## 1: NA 334
## 2: FALSE 145
## 3: TRUE 7
unique(sub_dt[regulation %in% c("repressed", "up", "up+repressed")][,.(regulation,target_gene,bound)])[,.N,bound]
## bound N
## 1: NA 510
## 2: FALSE 82
## 3: TRUE 2
Cell type level expression for different geroups of Pou4 targets
## Loading required package: metacell
## Loading required package: tgconfig
## Loading required package: tgstat
## Loading required package: tglkmeans
## Warning in supportsMulticoreAndRStudio(...): [ONE-TIME WARNING] Forked
## processing ('multicore') is not supported when running R from RStudio because
## it is considered unstable. For more details, how to control forked processing
## or not, and how to silence this warning in future R sessions, see
## ?parallelly::supportsMulticore
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:Biostrings':
##
## collapse, intersect, setdiff, setequal, union
## The following object is masked from 'package:XVector':
##
## slice
## The following object is masked from 'package:AnnotationDbi':
##
## select
## The following object is masked from 'package:Biobase':
##
## combine
## The following object is masked from 'package:graph':
##
## union
## The following objects are masked from 'package:igraph':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:GenomicRanges':
##
## intersect, setdiff, union
## The following object is masked from 'package:GenomeInfoDb':
##
## intersect
## The following objects are masked from 'package:IRanges':
##
## collapse, desc, intersect, setdiff, slice, union
## The following objects are masked from 'package:S4Vectors':
##
## first, intersect, rename, setdiff, setequal, union
## The following objects are masked from 'package:BiocGenerics':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: RColorBrewer
## Loading required package: rasterpdf
##
## Attaching package: 'rasterpdf'
## The following object is masked from 'package:grDevices':
##
## dev.off
## Loading required package: vioplot
## Loading required package: sm
## Package 'sm', version 2.2-5.7: type help(sm) for summary information
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:sm':
##
## muscle
## The following object is masked from 'package:dplyr':
##
## select
## The following object is masked from 'package:AnnotationDbi':
##
## select
## The following object is masked from 'package:patchwork':
##
## area
## initializing scdb to ../scRNAseq_nvec_v4/scdb/
## Warning in asMethod(object): sparse->dense coercion: allocating vector of size
## 2.3 GiB
Motifs